/*******************************************************************************
																				
	DESCRIPTION: 	This do file creates the programs that clean the predictions
					coming from R, obtains the weigths for the algorithms, does
					the final interpolation step and produced the final Ensemble
					predictions.
	
*******************************************************************************/

global id_code 003

*******************************************************************************
** Creating ensemble predictions - combining the three different predictions **
*******************************************************************************
cap program drop cleanpred
program cleanpred
		
	syntax, model(string) [submodel(string)] year(string) outcomes(string) ///
		[model_sample(string)] [year_sample(string)] [outcomes_sample(string)] ///
		[name(string)] [surname(string)] [positive] [w_sample(real `= 1/6')] [w_if(string)] [spline(real 0)]
	
	qui {
		* Process optional arguments:
		if "`model_sample'" == "" {
			local model_sample `model'
		}
		
		if "`year_sample'" == "" {
			local year_sample `year'
		}
		
		if "`outcomes_sample'" == "" {
			local outcomes_sample `outcomes'
		}	
		
		if "`w_if'" != "" {
			local w_if `"w_if("`w_if'")"'
		}
			
		local K: word count `outcomes'
		local L: word count `outcomes_sample'
		
		if `K' != `L' {
			noisily di "ERROR: outcome vectors are not the same length."
			exit
		}
			
			
		* Loop over all outcome variable combinations:
		forval k = 1/`K' {
						
			local outcome: word `k' of `outcomes'
			local outcome_sample: word `k' of `outcomes_sample'
			
			noisily di "--> Outcome `k'/`K': `outcome'"
			
			* Compute calibration:
			noisily calibrate, model(`model') submodel(`submodel') outcome(`outcome') ///
				year(`year') w_sample(`w_sample') `w_if' `positive' spline(`spline')
			
			* Apply calibration to sample:
			noisily apply_calibration, model(`model') submodel(`submodel') outcome(`outcome') year(`year') ///
				model_sample(`model_sample') outcome_sample(`outcome_sample') year_sample(`year_sample') ///
				name(`name') w_sample(`w_sample') spline(`spline')
		
			* Save predictions
			tempfile t`k'
			save `t`k''
		
		}
		
		*******************************************************************************
		* Putting the predictions for different outcomes together
		*******************************************************************************	
		local i 1
		forval k = 1/`K' {
			
			local outcome: word `k' of `outcomes'
			local outcome_sample: word `k' of `outcomes_sample'
			
			if `i++' == 1 {
				use `t`k'', clear
			}
			
			else {
				merge 1:1 n using  `t`k'', nogen
			}
				
		}
		
		* Save data for each model and set of variables separately:
		compress
		save "$data/${id_code}_PredEnsemble_`model_sample'`submodel'_`year_sample'`name'`surname'.dta", replace
		
		*******************************************************************************
		* Merge the predictions with outcome variables and info on unempl spell *
		*******************************************************************************
					
		* Get the outcome variables from 002 dataset
		use Lop* InLnr n inSample emplAft* using "${data}/002_DataForR_`model_sample'_`year_sample'.dta", clear
		
		* Merge with the predictions
		merge 1:1 n ///
			using "$data/${id_code}_PredEnsemble_`model_sample'`submodel'_`year_sample'`name'`surname'.dta" ///
			, assert(1 3) /// there should be no _merge==2, there will be some _merge==1 which are observations used to train the model, obtain weights and interpolate
			keep(3) nogen

		* Merging back to main data to obtain duration of unemployment spells
		merge 1:1 LopNr_PersonNr InLnr ///
			using "$data/001_9_FinalMainDataset.dta" ///
			, keepusing(year duration startU trueEnd) ///
			assert(2 3) /// there should be no _merge==1
			keep(3) nogen
			
		/* Ensure that we don't have predictions without a corresponding Y value. 
		We do this because we might have individuals for whom predictions were made 
		in two years (for both years we have Xs), but whose outcome was non-missing 
		in one year only. In the next step, when we combine the observations to ensure 
		that there is only one observation per spell, we do not want to have two 
		different predictions for the same person */	

		forval k = 1/`K' {
			
			local outcome: word `k' of `outcomes'
			local outcome_sample: word `k' of `outcomes_sample'
			local pred_sample p_`outcome_sample'
			
			if "`outcome'" != "`outcome_sample'" {
				local dur = substr("`outcome'", 11, 2)
				if "`dur'" == "12" {
					local dur "12M"
				}
				local pred_sample p_`outcome_sample'_`dur'_Mod
			} 
			
			assert !missing(`outcome_sample') if !missing(`pred_sample')
			
		}
		
		order LopNr_PersonNr InLnr year, first
		compress
		
		save "$data/${id_code}_MainWithEnsemblePred_`model_sample'`submodel'_`year_sample'`name'`surname'.dta", replace
		
	}
	
end


cap program drop calibrate
program calibrate
		
	syntax, model(string) [submodel(string)] year(string) outcome(string) ///
		[name(string)] [positive] [w_sample(real `= 1/6')] [spline(real 0)] ///
		[w_if(string)]
		
	qui {
		* Set number of groups for final interpolation step (if wanted):
		global num_groups = 250 
			
		* Generate local with the name of the outcomes:
		local pred p_`outcome'

		* Process optional if argument:
		if "`w_if'" != "" {
			local w_if "& `w_if'"
		}
		
		*******************************************************************************
		***********************  Importing and cleaning data 	***********************
		*******************************************************************************
		
		* Importing data
		import delimited "$data/103_predictionsR_`model'`submodel'_`outcome'_`year'.csv", clear case(preserve)
		
		* Cleaning data
		if "`model'" != "Full_NoTraining" & "`model'" != "Full_NoRecalls" {
			gen `outcome' = total_y == 1
			rename prob_rf2 `pred'_rf
			rename prob_boost `pred'_boost
			rename s0 `pred'_lasso
			rename V2 n
			rename V1 LopNr_PersonNr
			order n `outcome' `pred'_rf `pred'_boost `pred'_lasso , first

			* Keeping only relevant variables
			drop v1 total_y
		}
		else {
			gen `outcome'_temp = `outcome' == "yes"
			drop `outcome'
			rename `outcome'_temp `outcome'
		}
		
		order n `outcome' `pred'_rf `pred'_boost `pred'_lasso , first

		
		*******************************************************************************
		***********************  Calculating weights  	*****************************
		*******************************************************************************
		
		* Set aside part of the sample to estimate the weights:
		gen weights = 0
		sum weights
		replace weights = 1 if _n < round(r(N) * `w_sample', 1)
		
		* Weights are obtained by running a constrained regression. The 
		* constraint is that weights need to add up to 1 and, optionally, that
		* they are positive:
		if "`positive'" == "" {
			constraint 1 `pred'_rf + `pred'_boost + `pred'_lasso = 1
			
			cnsreg `outcome' `pred'_rf `pred'_boost ///
			`pred'_lasso if weights == 1 `w_if', nocons constraints(1)
			
			* Save weights as locals: 
			local w_rf = _b[`pred'_rf]
			local w_boost = _b[`pred'_boost]
			local w_lasso = _b[`pred'_lasso]
			
		}
		else if "`positive'" == "positive" {
			* If positive option specified, we constraint weights to sum to one
			* using an exponential transformation and the non-linear least squares
			* estimation command nl:
			nl (`outcome' = ///
				exp({a1}) / exp(1 + {a1} + {a2}) * `pred'_rf + ///
				exp({a2}) / exp(1 + {a1} + {a2}) * `pred'_boost + ///
				exp(1) / exp(1 + {a1} + {a2}) * `pred'_lasso) if weights == 1

			* Convert back to weights and save as locals: 
			local w_rf = exp(_b[a1:_cons]) / exp(1 + _b[a1:_cons] + _b[a2:_cons])
			local w_boost =  exp(_b[a2:_cons]) / exp(1 + _b[a1:_cons] + _b[a2:_cons])
			local w_lasso = exp(1) / exp(1 + _b[a1:_cons] + _b[a2:_cons])
			
		}
		
		* Print:
		noisily di "-----> Computed weigths (`: di %5.2f `=100*`w_sample''' of remaining sample): `: di %5.3f `w_rf'' * RF + `: di %5.3f `w_boost'' * BOOST + `: di %5.3f `w_lasso'' * LASSO"
		
		* Save weights in a frame and export: 
		tempname weights
		frame create `weights' str30(model) str30(year) str30(outcome) /// 
			weight_rf weight_boost weight_lasso
		
		frame post `weights' ("`model'") ("`year'") ("`outcome'") ///
			(`w_rf') (`w_boost') (`w_lasso')
			
		frame `weights': save "$data/${id_code}_Weights_`model'`submodel'_`outcome'_`year'.dta", replace
		

		*******************************************************************************
		************* Generating a weighted average of predictions 	*******************
		*******************************************************************************
		
		gen `pred' = ///
			(`w_rf' * `pred'_rf) + (`w_boost' * `pred'_boost) + (`w_lasso' * `pred'_lasso)
		
		label var `pred' ///
			"Pred. JFR in `months' months, after `months' months unempl. (ensemble - `model')"
		
		* Keep only those observations that were not used to calculate weights
		keep if weights == 0 
		keep n `pred'* `outcome'

		*******************************************************************************
		************* Creating means for the final calibration 	*******************
		*******************************************************************************
		
		if `spline' != 0 {
			
			noisily di "-----> Computing spline (`: di %5.2f `=100*`spline''' of remaining sample)"
			* Use part of the sample to perform the final calibration - interpolation
			sum n
			keep if _n < round(r(N) * `spline', 1)
			
			* Split the data into k groups, k defined as a global at the 
			* start of this code
			egen groups = cut(`pred'), group(${num_groups})
			
			* Calculate the mean predicted and observed JFR in each group
			collapse (mean) int_`outcome' = `outcome' `pred', by(groups)
			
			* Save:
			save "$data/${id_code}_Means_`model'`submodel'_`outcome'_`year'.dta", replace

			
		}
			
	}
	
end


cap program drop apply_calibration
program apply_calibration
		
	syntax, model(string) [submodel(string)] year(string) outcome(string) ///
		[model_sample(string)] [year_sample(string)] [outcome_sample(string)] ///
		[name(string)] [w_sample(real `= 1/6')] [spline(real 0)]
		
	qui {
		* Process optional arguments:
		if "`model_sample'" == "" {
			local model_sample `model'
		}
		
		if "`year_sample'" == "" {
			local year_sample `year'
		}
		
		if "`outcome_sample'" == "" {
			local outcome_sample `outcome'
		}
		
		* Add names for predictions:
		local pred p_`outcome'
		local pred_sample p_`outcome_sample'
								
		* Importing weights:
		use "$data/${id_code}_Weights_`model'`submodel'_`outcome'_`year'.dta", clear

		* Save to locals:
		local w_rf = weight_rf[1]
		local w_boost = weight_boost[1]
		local w_lasso = weight_lasso[1]

		noisily di "-----> Using weigths: `: di %5.3f `w_rf'' * RF + `: di %5.3f `w_boost'' * BOOST + `: di %5.3f `w_lasso'' * LASSO"
		
		* Importing data
		if "`outcome'" == "`outcome_sample'" {
			import delimited "$data/103_predictionsR_`model_sample'`submodel'_`outcome_sample'_`year_sample'`name'.csv", clear case(preserve)
			noisily di "Using sample from 103_predictionsR_`model_sample'`submodel'_`outcome_sample'_`year_sample'`name'.csv"
		} 
		else {
			import delimited "$data/103_predictionsR_`model_sample'`submodel'_`outcome_sample'_`year_sample'_`outcome'_Model.csv", clear case(preserve)
			noisily di "Using sample from 103_predictionsR_`model_sample'`submodel'_`outcome_sample'_`year_sample'_`outcome'_Model.csv"
		}
		pause
		
		* Cleaning data
		if "`model'" != "Full_NoTraining" & "`model'" != "Full_NoRecalls" {
			gen `outcome_sample' = total_y == 1
			rename prob_rf2 `pred_sample'_rf
			rename prob_boost `pred_sample'_boost
			rename s0 `pred_sample'_lasso
			rename V2 n
			rename V1 LopNr_PersonNr
			order n `outcome_sample' `pred_sample'_rf `pred_sample'_boost `pred_sample'_lasso , first

			* Keeping only relevant variables
			drop v1 total_y
		}
		else {
			gen `outcome_sample'_temp = `outcome_sample' == "yes"
			drop `outcome_sample'
			rename `outcome_sample'_temp `outcome_sample'
		}
			
		* Keep sample not used for weights or the spline:
		if `spline' == 0 {
			sum n
			drop if _n < round(r(N) * `w_sample', 1)
		}
		else {
			sum n
			drop if _n < round(r(N) * `w_sample', 1)
			sum n
			drop if _n < round(r(N) * `spline', 1)
		}	
			
		*******************************************************************************
		************* Generating a weighted average of predictions 	*******************
		*******************************************************************************
		
		gen `pred_sample' = ///
			(`w_rf' * `pred_sample'_rf) + (`w_boost' * `pred_sample'_boost) + (`w_lasso' * `pred_sample'_lasso)
		
		pause
		if "`model'" == "Full_NoTraining" {
			keep n `pred_sample'* `outcome_sample' training_combined_6months	
		}
		else if "`model'" == "Full_NoRecalls" {
			keep n `pred_sample'* `outcome_sample' recalled
		}
		else {
			keep n `pred_sample'* `outcome_sample'	
		}
			
		*******************************************************************************
		************* Performing the final calibration 	*******************
		*******************************************************************************		
				
		if `spline' != 0  {
			* Append the averages in k groups from the model trained Y months in:
			rename `pred_sample' `pred'
			append using "$data/${id_code}_Means_`model'`submodel'_`outcome'_`year'.dta"
			noisily di "Using means from ${id_code}_Means_`model'_`outcome'_`year'.dta"
			rename `pred' `pred_sample'
			pause
			
			* Take the predicted probability and linearly interpolate from the 
			* 250 prediction-outcome pairs to generate a new prediction for each 
			* variable
			ipolate int_`outcome' `pred_sample', gen(`pred_sample'_f) epolate
			pause
			* Drop 250 average of binned observations
			drop if n == .

			* Drop old version of the prediction and other no longer needed 
			* variables, rename the new version of the prediction
			drop int_`outcome' `pred_sample' groups
			rename `pred_sample'_f `pred_sample'
			
		}
		
		* Set all negative predictions to 0 and all predictions above 1 to 1
		replace `pred_sample' = 0 if `pred_sample' < 0 & `pred_sample' != .
		replace `pred_sample' = 1 if `pred_sample' > 1 & `pred_sample' != .
			
		*******************************************************************************
		************* Save final predictions 	*******************
		******************************************************************************
		* Rename prediction variable if needed:
		if "`outcome'" != "`outcome_sample'" {
			local dur = substr("`outcome'", 11, 2)
			if "`dur'" == "12" {
				local dur "12M"
			}
			rename (`pred_sample' `pred_sample'_rf `pred_sample'_boost `pred_sample'_lasso) ///
				(p_`outcome_sample'_`dur'_Mod `pred_sample'_`dur'_Mod_rf `pred_sample'_`dur'_Mod_boost `pred_sample'_`dur'_Mod_lasso)

		} 
		
		* Drop predictions from individual models
		if "`keep'" == "nokeep" {
			drop `pred_sample'_rf `pred_sample'_boost `pred_sample'_lasso 
		}
		
		* Drop empirical JFR 
		drop `outcome_sample'
	}
	
end

